{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "lang": "en"
   },
   "source": [
    "# 分词（20分）：由于企业描述是文本信息，需要对文本信息进行特征提取。文本分词可采用Jieba分词：\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "lang": "fr"
   },
   "source": [
    "# 分 词 （20 分） ： 由于 企业 描述 是 文本 信息 ， 需要 对 文本 信息 进行 特征 提取。 文本 分 词 可采用 Jieba 分 词 ："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T10:28:11.383327Z",
     "start_time": "2020-07-17T10:28:10.125627Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "import os \n",
    "import codecs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T10:28:11.487050Z",
     "start_time": "2020-07-17T10:28:11.384325Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   label                                               text\n",
       "0      2  合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...\n",
       "1      2  公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。\n",
       "2      1  公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...\n",
       "3      2  公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...\n",
       "4      2  该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专..."
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv(\"training.csv\", header = None, encoding = 'utf8')\n",
    "train.columns = ['label', 'text']\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T10:28:11.495032Z",
     "start_time": "2020-07-17T10:28:11.488050Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4774, 2)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T10:28:11.506002Z",
     "start_time": "2020-07-17T10:28:11.498021Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 4774 entries, 0 to 4773\n",
      "Data columns (total 2 columns):\n",
      "label    4774 non-null int64\n",
      "text     4774 non-null object\n",
      "dtypes: int64(1), object(1)\n",
      "memory usage: 74.7+ KB\n"
     ]
    }
   ],
   "source": [
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T10:28:11.668570Z",
     "start_time": "2020-07-17T10:28:11.510009Z"
    }
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAENCAYAAAAYIIIKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAFQ9JREFUeJzt3X+0ZWV93/H3BxAVIvJjLpTMTBwSpyphRaGzKMpa1DpWfkgZopIFVp0ltEMSQkDTRrB/YONKq0kaRGNIqCBDQ0AC0pkqEVmosV0WdFCiwGCZIDI3jMyl/DKyoo5++8d5ZnG8XJi7587Z517m/VrrrLP3s5+zv89Z4v3MfvaPk6pCkqTZ2mPcA5AkLSwGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUid7jXsAo7Bo0aJatmzZuIchSQvKHXfc8UhVTeyo3/MyOJYtW8aGDRvGPQxJWlCSfHc2/ZyqkiR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR18ry8c3x395krThx5jZPP/OsZ2y/+y+NHWvc9b795pPuXtGMecUiSOjE4JEmdGBySpE4MDklSJwaHJKkTg0OS1InBIUnqZGTBkeSKJFuT3DXU9odJ7k3yzSQ3Jtl/aNuFSTYl+XaS44faT2htm5JcMKrxSpJmZ5RHHFcCJ0xruwU4oqp+Bfi/wIUASQ4HTgd+uX3mT5PsmWRP4OPAicDhwBmtryRpTEYWHFX1ZeDRaW2fr6ptbfU2YElbXgVcW1U/rKrvAJuAo9trU1XdX1U/Aq5tfSVJYzLOcxxnAtufW7EY2Dy0bbK1PVv7MyRZk2RDkg1TU1MjGK4kCcYUHEn+I7ANuHp70wzd6jnan9lYdVlVraiqFRMTE7tmoJKkZ+j9IYdJVgMnAyuransITAJLh7otAR5qy8/WLkkag16POJKcALwPOKWqnhratB44PckLkxwGLAe+CnwNWJ7ksCR7MziBvr7PMUuSftbIjjiSXAO8HliUZBK4iMFVVC8EbkkCcFtV/XpV3Z3kOuAeBlNY51TVT9p+fgu4GdgTuKKq7h7VmCVJOzay4KiqM2Zovvw5+v8+8PsztN8E3LQLhyZJmgPvHJckdWJwSJI6MTgkSZ0YHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTgwOSVInBockqRODQ5LUyciCI8kVSbYmuWuo7cAktyS5r70f0NqT5KNJNiX5ZpKjhj6zuvW/L8nqUY1XkjQ7ozziuBI4YVrbBcCtVbUcuLWtA5wILG+vNcClMAga4CLgnwNHAxdtDxtJ0niMLDiq6svAo9OaVwFr2/Ja4NSh9qtq4DZg/ySHAscDt1TVo1X1GHALzwwjSVKP+j7HcUhVbQFo7we39sXA5qF+k63t2dolSWMyX06OZ4a2eo72Z+4gWZNkQ5INU1NTu3RwkqSn9R0cD7cpKNr71tY+CSwd6rcEeOg52p+hqi6rqhVVtWJiYmKXD1ySNNB3cKwHtl8ZtRpYN9T+rnZ11THAE20q62bgTUkOaCfF39TaJEljsteodpzkGuD1wKIkkwyujvoQcF2Ss4AHgdNa95uAk4BNwFPAuwGq6tEkHwS+1vr9XlVNP+EuSerRyIKjqs54lk0rZ+hbwDnPsp8rgCt24dAkSXMwX06OS5IWCINDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUidjCY4k70lyd5K7klyT5EVJDktye5L7knwqyd6t7wvb+qa2fdk4xixJGug9OJIsBn4bWFFVRwB7AqcDHwYurqrlwGPAWe0jZwGPVdXLgYtbP0nSmOw1xrovTvJjYB9gC/AG4O1t+1rgA8ClwKq2DHA98CdJUlXV54ClZ/PmG/58pPv/7FvPHun+pa56P+Koqr8H/gh4kEFgPAHcATxeVdtat0lgcVteDGxun93W+h80fb9J1iTZkGTD1NTUaL+EJO3GxjFVdQCDo4jDgJ8H9gVOnKHr9iOKPMe2pxuqLquqFVW1YmJiYlcNV5I0zThOjr8R+E5VTVXVj4FPA68D9k+yfepsCfBQW54ElgK07S8FHu13yJKk7cYRHA8CxyTZJ0mAlcA9wBeBt7U+q4F1bXl9W6dt/4LnNyRpfMZxjuN2Bie5vw58q43hMuB9wHuTbGJwDuPy9pHLgYNa+3uBC/oesyTpaWO5qqqqLgIumtZ8P3D0DH3/ETitj3FJknbMO8clSZ0YHJKkTgwOSVInBockqZNZBUeSW2fTJkl6/nvOq6qSvIjBs6QWtTu+t9/FvR+Du74lSbuZHV2OezZwPoOQuIOng+NJ4OMjHJckaZ56zuCoqkuAS5KcW1Uf62lMkqR5bFY3AFbVx5K8Dlg2/JmqumpE45IkzVOzCo4k/x34JeBO4CetuQCDQ5J2M7N95MgK4HAfLihJmu19HHcB/2SUA5EkLQyzPeJYBNyT5KvAD7c3VtUpIxmVJGnemm1wfGCUg5AkLRyzvarqb0Y9EEnSwjDbq6q+z9O/87038ALgB1W136gGJkman2Z7xPGS4fUkpzLDjy5Jkp7/durpuFX1P4A37OKxSJIWgNlOVb1laHUPBvd1eE+HJO2GZntV1b8eWt4GPACs2uWjkSTNe7M9x/HuUQ9EkrQwzPaHnJYkuTHJ1iQPJ7khyZJRD06SNP/M9uT4J4H1DH6XYzHwP1ubJGk3M9vgmKiqT1bVtva6EpjY2aJJ9k9yfZJ7k2xM8tokBya5Jcl97f2A1jdJPppkU5JvJjlqZ+tKkuZutsHxSJJ3JNmzvd4B/L851L0E+FxVvRJ4NbARuAC4taqWA7e2dYATgeXttQa4dA51JUlzNNvgOBP4NeB7wBbgbcBOnTBPsh9wHHA5QFX9qKoeZ3CV1trWbS1walteBVxVA7cB+yc5dGdqS5LmbrbB8UFgdVVNVNXBDILkAztZ8xeBKeCTSb6R5BNJ9gUOqaotAO394NZ/MbB56POTrU2SNAazDY5fqarHtq9U1aPAkTtZcy/gKODSqjoS+AFPT0vNJDO0PePmwyRrkmxIsmFqamonhyZJ2pHZBsce209WAyQ5kNnfPDjdJDBZVbe39esZBMnD26eg2vvWof5Lhz6/BHho+k6r6rKqWlFVKyYmdvq8vSRpB2YbHP8V+EqSDyb5PeArwB/sTMGq+h6wOckrWtNK4B4Gl/uubm2rgXVteT3wrnZ11THAE9untCRJ/ZvtneNXJdnA4MGGAd5SVffMoe65wNVJ9gbuZ3CifQ/guiRnAQ8Cp7W+NwEnAZuAp9jJk/KSpF1j1tNNLSjmEhbD+7qTwYMSp1s5Q98CztkVdSVJc7dTj1WXJO2+DA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnYwtOJLsmeQbST7T1g9LcnuS+5J8Ksnerf2FbX1T275sXGOWJI33iOM8YOPQ+oeBi6tqOfAYcFZrPwt4rKpeDlzc+kmSxmQswZFkCfBm4BNtPcAbgOtbl7XAqW15VVunbV/Z+kuSxmBcRxwfAX4X+GlbPwh4vKq2tfVJYHFbXgxsBmjbn2j9JUlj0HtwJDkZ2FpVdww3z9C1ZrFteL9rkmxIsmFqamoXjFSSNJNxHHEcC5yS5AHgWgZTVB8B9k+yV+uzBHioLU8CSwHa9pcCj07faVVdVlUrqmrFxMTEaL+BJO3Geg+OqrqwqpZU1TLgdOALVfVvgC8Cb2vdVgPr2vL6tk7b/oWqesYRhySpH/PpPo73Ae9NsonBOYzLW/vlwEGt/b3ABWManyQJ2GvHXUanqr4EfKkt3w8cPUOffwRO63VgkqRnNZ+OOCRJC4DBIUnqxOCQJHVicEiSOjE4JEmdGBySpE4MDklSJwaHJKkTg0OS1InBIUnqZKyPHJG08065ft2OO83B+retGun+tXB5xCFJ6sTgkCR1YnBIkjoxOCRJnXhyXJJ24N4/fXjkNV75m4eMvMau4hGHJKkTg0OS1InBIUnqxOCQJHVicEiSOjE4JEmdGBySpE68j0PPCyeu+/WR1/jrVX828hrSQtD7EUeSpUm+mGRjkruTnNfaD0xyS5L72vsBrT1JPppkU5JvJjmq7zFLkp42jqmqbcDvVNWrgGOAc5IcDlwA3FpVy4Fb2zrAicDy9loDXNr/kCVJ2/U+VVVVW4Atbfn7STYCi4FVwOtbt7XAl4D3tfarqqqA25Lsn+TQth9JY3DaDXeNdP9/9dYjRrp/zc1YT44nWQYcCdwOHLI9DNr7wa3bYmDz0McmW9v0fa1JsiHJhqmpqVEOW5J2a2MLjiQ/B9wAnF9VTz5X1xna6hkNVZdV1YqqWjExMbGrhilJmmYswZHkBQxC4+qq+nRrfjjJoW37ocDW1j4JLB36+BLgob7GKkn6WeO4qirA5cDGqvrjoU3rgdVteTWwbqj9Xe3qqmOAJzy/IUnjM477OI4F3gl8K8mdre39wIeA65KcBTwInNa23QScBGwCngLe3e9wJUnDxnFV1f9m5vMWACtn6F/AOSMdlCRp1rxzXNKCcd0Nj4x0/7/21kUj3f/zhc+qkiR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjrxkSMj9HcfWzXS/f/Suet23EnSgvbwR7460v0fcv7RnT/jEYckqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ087y/Hnbr0L0ZeY+I33jHyGpI0X3jEIUnqxOCQJHVicEiSOjE4JEmdLJjgSHJCkm8n2ZTkgnGPR5J2VwsiOJLsCXwcOBE4HDgjyeHjHZUk7Z4WRHAARwObqur+qvoRcC0w2kfPSpJmtFCCYzGweWh9srVJknqWqhr3GHYoyWnA8VX1b9v6O4Gjq+rcoT5rgDVt9RXAt+dQchHwyBw+v9DqjrP27lZ3nLX9zrtH7bnUfVlVTeyo00K5c3wSWDq0vgR4aLhDVV0GXLYriiXZUFUrdsW+FkLdcdbe3eqOs7bfefeo3UfdhTJV9TVgeZLDkuwNnA6sH/OYJGm3tCCOOKpqW5LfAm4G9gSuqKq7xzwsSdotLYjgAKiqm4Cbeiq3S6a8FlDdcdbe3eqOs7bfefeoPfK6C+LkuCRp/lgo5zgkSfOEwTEkyRVJtia5q+e6S5N8McnGJHcnOa+nui9K8tUkf9vq/qc+6g7V3zPJN5J8pue6DyT5VpI7k2zose7+Sa5Pcm/73/q1PdV9Rfuu219PJjm/p9rvaf9t3ZXkmiQv6qNuq31eq3v3KL/vTH83khyY5JYk97X3A3qsfVr7zj9NMpKrqwyOn3UlcMIY6m4DfqeqXgUcA5zT0yNVfgi8oapeDbwGOCHJMT3U3e48YGOP9Yb9y6p6Tc+XS14CfK6qXgm8mp6+e1V9u33X1wD/DHgKuHHUdZMsBn4bWFFVRzC4sOX0UddttY8A/h2Dp068Gjg5yfIRlbuSZ/7duAC4taqWA7e29b5q3wW8BfjyiGoaHMOq6svAo2Oou6Wqvt6Wv8/gD8rI74yvgX9oqy9or15OeiVZArwZ+EQf9cYtyX7AccDlAFX1o6p6fAxDWQn8XVV9t6d6ewEvTrIXsA/T7r8aoVcBt1XVU1W1Dfgb4FdHUehZ/m6sAta25bXAqX3VrqqNVTWXG6B3yOCYZ5IsA44Ebu+p3p5J7gS2ArdUVS91gY8Avwv8tKd6wwr4fJI72hMH+vCLwBTwyTY994kk+/ZUe9jpwDV9FKqqvwf+CHgQ2AI8UVWf76M2g391H5fkoCT7ACfxszcRj9ohVbUFBv8wBA7usfbIGRzzSJKfA24Azq+qJ/uoWVU/aVMYS4Cj2yH+SCU5GdhaVXeMutazOLaqjmLwtOVzkhzXQ829gKOAS6vqSOAHjG76Ykbt5tlTgL/qqd4BDP7lfRjw88C+Sd7RR+2q2gh8GLgF+BzwtwymhLULGBzzRJIXMAiNq6vq033Xb9MmX6KfczzHAqckeYDBk47fkOQveqgLQFU91N63MpjrP7qHspPA5NAR3fUMgqRPJwJfr6qHe6r3RuA7VTVVVT8GPg28rqfaVNXlVXVUVR3HYDrnvr5qAw8nORSgvW/tsfbIGRzzQJIwmPveWFV/3GPdiST7t+UXM/g/+r2jrltVF1bVkqpaxmDq5AtV1cu/RJPsm+Ql25eBNzGY1hipqvoesDnJK1rTSuCeUded5gx6mqZqHgSOSbJP+298JT1eDJHk4Pb+CwxOFvf53dcDq9vyamBdj7VHbsHcOd6HJNcArwcWJZkELqqqy3sofSzwTuBb7XwDwPvb3fKjdCiwtv1Q1h7AdVXV66WxY3AIcOPg7xh7AX9ZVZ/rqfa5wNVtyuh+4N091aXN8/8r4Oy+albV7UmuB77OYJroG/R7N/UNSQ4CfgycU1WPjaLITH83gA8B1yU5i0GAntZj7UeBjwETwGeT3FlVx+/Sut45LknqwqkqSVInBockqRODQ5LUicEhSerE4JAkdWJwSDupPWX3jbPoV0levpM1dvqz0qgYHJKkTgwOSVInBoc0R0mOTvJ/kjyeZEuSP2l3hw87Kcn9SR5J8odJ9hj6/Jnth50eS3Jzkpf1/BWkTgwOae5+ArwHWAS8lsEzmX5zWp9fBVYweLDhKuBMgCSnAu9n8CylCeB/0e8zlaTODA5pjqrqjqq6raq2VdUDwJ8D/2Jatw9X1aNV9SCD3yI5o7WfDfyX9uM724D/DLzGow7NZwaHNEdJ/mmSzyT5XpInGfzxXzSt2+ah5e8y+H0KgJcBl7RprscZPKAu9PALkNLOMjikubuUwePol1fVfgymnjKtz/Cvz/0CT/+E6mbg7Kraf+j14qr6yshHLe0kg0Oau5cATwL/kOSVwG/M0Oc/JDkgyVLgPOBTrf3PgAuT/DJAkpcmGckjuKVdxeCQ5u7fA28Hvg/8N54OhWHrgDuAO4HPMvjhLqrqRgY/cXptm+a6i8Ev9Unzlr/HIUnqxCMOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTgwOSVIn/x/1aMs6XUFhIwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "fig = plt.figure()\n",
    "sns.countplot(train.label.values)\n",
    "plt.xlabel('label', fontsize = 12)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如上图，根据标签分类的话，一共分为11类"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 分词（20分）：由于企业描述是文本信息，需要对文本信息进行特征提取。文本分词可采用Jieba分词： "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T07:44:55.361656Z",
     "start_time": "2020-07-17T07:44:53.912478Z"
    },
    "lang": "en"
   },
   "source": [
    "# 分词（20分）：由于企业描述是文本信息，需要对文本信息进行特征提取。文本分词可采用Jieba分词： "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读取停用词表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T10:55:15.063248Z",
     "start_time": "2020-07-17T10:55:15.057277Z"
    }
   },
   "outputs": [],
   "source": [
    "#在stopwords.txt文件中，由于是 每行一个词，所以打开文件后，循环每一行以回车为分隔符\n",
    "def read_file(file_path):\n",
    "    f = codecs.open(file_path, mode = 'r', encoding = 'utf-8')\n",
    "    lines = []\n",
    "    for line in f:\n",
    "        line = line.rstrip('\\n').rstrip('\\r')\n",
    "        lines.append(line)\n",
    "    return lines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T10:55:17.414585Z",
     "start_time": "2020-07-17T10:55:17.407605Z"
    }
   },
   "outputs": [],
   "source": [
    "stopwords = read_file('stopwords.txt')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T10:55:19.561691Z",
     "start_time": "2020-07-17T10:55:19.557715Z"
    }
   },
   "outputs": [],
   "source": [
    " import jieba as jb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T10:57:50.245243Z",
     "start_time": "2020-07-17T10:57:50.241252Z"
    }
   },
   "outputs": [],
   "source": [
    "#对比训练数据中的词与停用词表里的词，保留停用词表里没有的词，并且用空格分开\n",
    "def segment_text(each_row):\n",
    "    return ' '.join([word for word in jb.lcut(each_row['text']) if word not in stopwords])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T11:09:17.090810Z",
     "start_time": "2020-07-17T11:08:58.220455Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Dumping model to file cache C:\\Users\\郭国庆\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.738 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "train['text_segmentation'] = train.apply(segment_text, axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T11:09:25.481536Z",
     "start_time": "2020-07-17T11:09:25.470566Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>text</th>\n",
       "      <th>text_segmentation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...</td>\n",
       "      <td>合晟 资产 一家 专注 股票 债券 二级 市场 投资 合格 投资者 提供 专业 资产 管理 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。</td>\n",
       "      <td>公司 主营业务 微 企业 个体 工商户 农户 客户 提供 贷款 服务 设立 主营业务 未 发...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...</td>\n",
       "      <td>公司 立足于 商业地产 服务 致力于 商业地产 开发 销售 运营 全 产业链 提供 一整套 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...</td>\n",
       "      <td>公司 工商管理 部门 核准 经营范围 投资 咨询 经济 信息 咨询 企业 管理 咨询 品牌 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...</td>\n",
       "      <td>公司 主营业务 中国 境内 港 澳 台 保险代理 销售 依托 产品 研究 能力 专业化 服务...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   label                                               text  \\\n",
       "0      2  合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...   \n",
       "1      2  公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。   \n",
       "2      1  公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...   \n",
       "3      2  公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...   \n",
       "4      2  该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...   \n",
       "\n",
       "                                   text_segmentation  \n",
       "0  合晟 资产 一家 专注 股票 债券 二级 市场 投资 合格 投资者 提供 专业 资产 管理 ...  \n",
       "1  公司 主营业务 微 企业 个体 工商户 农户 客户 提供 贷款 服务 设立 主营业务 未 发...  \n",
       "2  公司 立足于 商业地产 服务 致力于 商业地产 开发 销售 运营 全 产业链 提供 一整套 ...  \n",
       "3  公司 工商管理 部门 核准 经营范围 投资 咨询 经济 信息 咨询 企业 管理 咨询 品牌 ...  \n",
       "4  公司 主营业务 中国 境内 港 澳 台 保险代理 销售 依托 产品 研究 能力 专业化 服务...  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "tfidf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T07:14:02.439716Z",
     "start_time": "2020-07-17T07:14:02.435719Z"
    }
   },
   "source": [
    "# 特征提取（20分）： 去掉停用词后（stopwords.txt），采用TFIDF作为每个文本的特征描述。\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T11:20:23.537803Z",
     "start_time": "2020-07-17T11:20:23.533813Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import  TfidfVectorizer\n",
    "tfidf = TfidfVectorizer(min_df = 5)#min_df = 5表示所统计的词至少要出现5次"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T11:21:48.380004Z",
     "start_time": "2020-07-17T11:21:47.924508Z"
    }
   },
   "outputs": [],
   "source": [
    "#输出系数矩阵\n",
    "train_tfidf = tfidf.fit_transform(train['text_segmentation']).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T11:33:24.848438Z",
     "start_time": "2020-07-17T11:33:24.827529Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>00</th>\n",
       "      <th>000</th>\n",
       "      <th>002</th>\n",
       "      <th>004</th>\n",
       "      <th>008</th>\n",
       "      <th>01</th>\n",
       "      <th>02</th>\n",
       "      <th>022</th>\n",
       "      <th>028</th>\n",
       "      <th>03</th>\n",
       "      <th>...</th>\n",
       "      <th>黑色金属</th>\n",
       "      <th>黑龙江</th>\n",
       "      <th>黑龙江省</th>\n",
       "      <th>鼓励</th>\n",
       "      <th>齐全</th>\n",
       "      <th>齿轮</th>\n",
       "      <th>齿轮箱</th>\n",
       "      <th>龙头</th>\n",
       "      <th>龙头企业</th>\n",
       "      <th>龙门</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 5826 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    00  000  002  004  008   01   02  022  028   03 ...   黑色金属  黑龙江  黑龙江省  \\\n",
       "0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 ...    0.0  0.0   0.0   \n",
       "1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 ...    0.0  0.0   0.0   \n",
       "2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 ...    0.0  0.0   0.0   \n",
       "3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 ...    0.0  0.0   0.0   \n",
       "4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 ...    0.0  0.0   0.0   \n",
       "\n",
       "    鼓励   齐全   齿轮  齿轮箱   龙头  龙头企业   龙门  \n",
       "0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  \n",
       "1  0.0  0.0  0.0  0.0  0.0   0.0  0.0  \n",
       "2  0.0  0.0  0.0  0.0  0.0   0.0  0.0  \n",
       "3  0.0  0.0  0.0  0.0  0.0   0.0  0.0  \n",
       "4  0.0  0.0  0.0  0.0  0.0   0.0  0.0  \n",
       "\n",
       "[5 rows x 5826 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train_tfidf = pd.DataFrame(columns = tfidf.get_feature_names(), data = train_tfidf)\n",
    "df_train_tfidf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 采用KMeans聚类算法，根据第2 步得到特征对企业进行聚类， 尝试K=5，10，15，20，30，..., 50, 并选择合适的度量指标，选择最佳的K。（60分） "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T12:05:15.321213Z",
     "start_time": "2020-07-17T12:05:15.316230Z"
    }
   },
   "outputs": [],
   "source": [
    "#导入相应的工具包\n",
    "from sklearn.preprocessing import normalize\n",
    "from sklearn.cluster import KMeans#采用K均值聚类\n",
    "from sklearn import metrics#评价指标"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T12:05:17.321157Z",
     "start_time": "2020-07-17T12:05:17.318155Z"
    }
   },
   "outputs": [],
   "source": [
    "x_train = df_train_tfidf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T12:05:19.515578Z",
     "start_time": "2020-07-17T12:05:19.401850Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       ..., \n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "normalize(x_train, norm=\"l2\", copy=False)# 因为要计算样本之间的距离,对每个样本数据进行归一化，每个样本的模长为1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T12:05:23.120178Z",
     "start_time": "2020-07-17T12:05:22.943661Z"
    }
   },
   "outputs": [],
   "source": [
    "from scipy.sparse import csr_matrix\n",
    "x_train = csr_matrix(x_train)#由于数据是稀疏的所以采用稀疏矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T12:06:09.415277Z",
     "start_time": "2020-07-17T12:06:09.410289Z"
    }
   },
   "outputs": [],
   "source": [
    "#一个参数点聚类数据为K的模型\n",
    "def k_cluster_analysis(k, x):\n",
    "    print(\"K-means begin with clusters: {}\".format(k))\n",
    "    \n",
    "    #k-means,在训练集上训练\n",
    "    mb_kmeans = KMeans(n_clusters = k)\n",
    "    y_pred = mb_kmeans.fit_predict(x)\n",
    "    \n",
    "    # K值的评估标准\n",
    "    #本案例中训练数据有标签，可采用有参考模型的评价指标\n",
    "    #v_score = metrics.v_measure_score(y_val, y_val_pred)\n",
    "    \n",
    "    #亦可采用无参考默的评价指标：轮廓系数Silhouette Coefficient和Calinski-Harabasz Index\n",
    "    #这两个分数值越大则聚类效果越好\n",
    "    CH_score = metrics.calinski_harabaz_score(x.todense(), y_pred)\n",
    "    \n",
    "    #轮廓系数Silhouette Coefficient在大样本时计算太慢\n",
    "    #si_score = metrics.silhouette_score(X, y_pred)\n",
    "    \n",
    "    print(\"CH_score: {}\".format(CH_score))\n",
    "    #print(\"si_score: {}\".format(si_score))\n",
    "    \n",
    "    return CH_score#,si_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T12:15:35.711291Z",
     "start_time": "2020-07-17T12:06:11.809270Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 5\n",
      "CH_score: 39.60332583262302\n",
      "K-means begin with clusters: 10\n",
      "CH_score: 29.16848405801435\n",
      "K-means begin with clusters: 15\n",
      "CH_score: 23.71926273733945\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 19.902787964308526\n",
      "K-means begin with clusters: 25\n",
      "CH_score: 17.5723011184242\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 15.91708825409405\n",
      "K-means begin with clusters: 35\n",
      "CH_score: 14.83313574050492\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 13.729513708624085\n",
      "K-means begin with clusters: 45\n",
      "CH_score: 13.054792895652687\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 12.382132187464887\n"
     ]
    }
   ],
   "source": [
    "\n",
    "#设置超参数（聚类数目K）的搜索范围\n",
    "ks = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]\n",
    "CH_scores = []\n",
    "for k in ks:\n",
    "    ch = k_cluster_analysis(k, x_train)#调用过程\n",
    "    CH_scores.append(ch)#评价指标"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-17T12:15:48.383728Z",
     "start_time": "2020-07-17T12:15:48.293872Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD8CAYAAABn919SAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAG19JREFUeJzt3Xt4VdWZx/HvG66iIAIBowFBwAtPq1AjokAHqbd6xYrVDrZ4haJQBdqx1Y5WRx2dGZWpLToIWnTwfoNxdCoPhREoRYOiYqmCCEJFiaMUsQMSeOePdTIJGMhJOOesc/b5fZ5nPzln752cl/08+WWx9lprm7sjIiKFryR2ASIikhkKdBGRhFCgi4gkhAJdRCQhFOgiIgmhQBcRSQgFuohIQijQRUQSQoEuIpIQzXP5YZ06dfLu3bvn8iNFRArekiVLPnH30obOy2mgd+/encrKylx+pIhIwTOzNemcpy4XEZGEUKCLiCRE2oFuZs3M7HUzez71voeZLTazFWb2uJm1zF6ZIiLSkMa00K8Gltd5fwdwt7v3Bj4DLstkYSIi0jhpBbqZlQNnAFNT7w0YCjyVOmU6MCwbBYqISHrSbaFPAv4O2JF63xHY6O7VqffrgIPr+0YzG2VmlWZWWVVVtVfFiojI7jUY6GZ2JrDB3ZfU3V3PqfU++sjdp7h7hbtXlJY2OIxSRESaKJ0W+kDgbDNbDTxG6GqZBLQ3s5px7OXAh1mpEJgzB26/PVs/XUQkGRoMdHf/mbuXu3t34ELgd+4+ApgLDE+dNhKYma0i/+u/4Oc/hzVpDa0XESlOezMO/VpggpmtJPSpT8tMSV81blz4es892foEEZHC16hAd/d57n5m6vUqd+/v7r3c/Xx335qdEqFbN/jud+H++2HTpmx9iohIYSuYmaITJoQwn5a1/weIiBS2ggn0igoYPBj+9V+hurrh80VEik3BBDqEVvqaNfDss7ErERHJPwUV6GedBb16wZ13gtc76l1EpHgVVKA3awbXXAOLF8OiRbGrERHJLwUV6AAXXwwHHAB33RW7EhGR/FJwgb7vvjB6dOhHX7UqdjUiIvmj4AIdwkSjZs3CiBcREQkKMtAPOgguvDCMSd+4MXY1IiL5oSADHWD8ePjiizB7VERECjjQ+/WDE0+EX/4Stm2LXY2ISHwFG+gAEyfCunXw5JOxKxERia+gA/3b34bDD9dEIxERKPBALykJfemvvQbz58euRkQkroIOdIDvfx86dtREIxGRgg/0Nm3gyith1ixYsSJ2NSIi8RR8oEMI9BYtYNKk2JWIiMSTiEA/8EAYMQIefBA+/TR2NSIicSQi0CHcHP3f/4X77otdiYhIHIkJ9K9/HU45JTxIemvWnm4qIpK/EhPoEJ5o9NFH8PjjsSsREcm9RAX6KadAnz5hCKMmGolIsUlUoJuFVvobb8Dvfhe7GhGR3EpUoEMY7dK5syYaiUjxSVygt24NV10FL7wAy5fHrkZEJHcSF+gAY8ZAq1aaaCQixSWRgV5aCj/4ATz0EFRVxa5GRCQ3Ggx0M2ttZq+Y2Rtm9raZ3ZTa/xsze9/Mlqa2vtkvN33jx8OWLXDvvbErERHJjXRa6FuBoe5+NNAXOM3MBqSO/cTd+6a2pVmrsgmOPBJOPx1+/esQ7CIiSddgoHuwOfW2RWoriFHeEybAhg3wyCOxKxERyb60+tDNrJmZLQU2ALPdfXHq0K1m9qaZ3W1mrbJWZRMNHQpHH62JRiJSHNIKdHff7u59gXKgv5l9DfgZcARwLNABuLa+7zWzUWZWaWaVVTm+Q1kz0ejtt+Gll3L60SIiOdeoUS7uvhGYB5zm7utT3TFbgQeB/rv5ninuXuHuFaWlpXtdcGNdeCGUlWmikYgkXzqjXErNrH3q9T7AScCfzKwstc+AYcCybBbaVC1bwtixoYW+LC8rFBHJjHRa6GXAXDN7E3iV0If+PDDDzN4C3gI6Abdkr8y988MfhkfVqZUuIknWvKET3P1NoF89+4dmpaIs6NABLr4Ypk6F224LTzgSEUmaRM4Urc/VV8O2bTB5cuxKRESyo2gC/bDD4KyzQqD/9a+xqxERybyiCXSAiRPhf/4HHn44diUiIplXVIE+eDAccwzcfTfs2BG7GhGRzCqqQK+ZaPTOO/Dii7GrERHJrKIKdIDzz4fycrjzztiViIhkVtEFeosW8KMfwdy58PrrsasREcmcogt0gCuugH33DX3pIiJJUZSB3r49XHYZPPoo/PnPsasREcmMogx0CBONduyAX/0qdiUiIplRtIF+6KFw7rlw332weXPD54uI5LuiDXQIQxg3boTp02NXIiKy94o60I8/Ho47DiZNgu3bY1cjIrJ3ijrQayYarVwJ//EfsasREdk7RR3oAN/5DhxyiNZKF5HCV/SB3rx5GPEyfz68+mrsakREmq7oAx3CmPS2bTXRSEQKmwIdaNcuzB594gn44IPY1YiINI0CPeVHPwpf77knbh0iIk2lQE855BAYPhymTIHPP49djYhI4ynQ65gwATZtggceiF2JiEjjKdDr6N8fBg4ME42qq2NXIyLSOAr0XUycCKtXw3PPxa5ERKRxFOi7OPvssHCXJhqJSKFRoO+iWTO45hpYtChsIiKFQoFej0suCQ/BUCtdRAqJAr0e++0Ho0fDM8/A++/HrkZEJD0NBrqZtTazV8zsDTN728xuSu3vYWaLzWyFmT1uZi2zX27ujB0LJSXwy1/GrkREJD3ptNC3AkPd/WigL3CamQ0A7gDudvfewGfAZdkrM/fKy+GCC2Dq1PAQDBGRfNdgoHtQ85C2FqnNgaHAU6n904FhWakwogkTwuPppk6NXYmISMPS6kM3s2ZmthTYAMwG3gM2unvN9Jt1wMHZKTGeb3wDhgwJ3S7btsWuRkRkz9IKdHff7u59gXKgP3BkfafV971mNsrMKs2ssqqqqumVRjJhAqxdC08/HbsSEZE9a9QoF3ffCMwDBgDtzax56lA58OFuvmeKu1e4e0Vpaene1BrFGWdA795w553g9f7JEhHJD+mMcik1s/ap1/sAJwHLgbnA8NRpI4GZ2SoyppISGD8eKithwYLY1YiI7F46LfQyYK6ZvQm8Csx29+eBa4EJZrYS6AhMy16ZcY0cCR06aKKRiOS35g2d4O5vAv3q2b+K0J+eeG3awJgxcNttsHIl9OoVuyIRka/STNE0XXVVeKD07bfHrkREpH4K9DSVlcG4cTBtmsali0h+UqA3wh13wKmnhu6XOXNiVyMisjMFeiM0bw5PPAFHHAHnnQfLl8euSESklgK9kdq1g+efh9atwxj1DRtiVyQiEijQm+CQQ2DWLPjoIxg2DLZsiV2RiIgCvcn694eHHw5PNbr4YtixI3ZFIlLsFOh74bzzwo3Sxx+HG26IXY2IFLsGJxbJnv3kJ7BiBdx6a1jzZeTI2BWJSLFSoO8lM5g8OTyq7oorQv/6kCGxqxKRYqQulwxo0QKeeiosCfCd78A778SuSESKkQI9Q9q3h//8zzBW/Ywz4JNPYlckIsVGgZ5BPXrAzJmwbh2cey5s3Rq7IhEpJgr0DDv+eJg+PaydfvnleiiGiOSObopmwQUXhGV2f/7z0K9+442xKxKRYqBAz5LrrgvDGX/xixDqI0bErkhEkk6BniVmMGUKrF4Nl14K3brB4MGxqxKRJFMfeha1bAnPPAPdu4ebpCtXxq5IRJJMgZ5lHTqE4YwQhjN++mncekQkuRToOdCrFzz3XOh+Oe88+PLL2BWJSBIp0HNk0CB44AGYNw9GjdJwRhHJPN0UzaERI8LIl5tugsMOCyNhREQyRYGeYzfeGG6OXn899OwZxqyLiGSCulxyzAymTQtdMCNHhgdkiIhkggI9glat4NlnobwczjkHVq2KXZGIJIECPZJOncJwxurqMJxx48bYFYlIoVOgR3T44WHi0XvvwfDhsG1b7IpEpJAp0CMbMgTuvx/mzIErr9RwRhFpugYD3cy6mtlcM1tuZm+b2dWp/b8wsz+b2dLUdnr2y02mkSPDqJepU+Gf/zl2NSJSqNIZtlgNTHT318ysLbDEzGanjt3t7v+SvfKKx803h+GM114bhjOed17sikSk0DQY6O6+Hlifev25mS0HDs52YcWmpAQefBDWrIGLLoKuXaF//9hViUghaVQfupl1B/oBi1O7xprZm2b2gJkdsJvvGWVmlWZWWVVVtVfFJt0++4RH2JWVwdlnh3AXEUlX2oFuZvsBTwPXuPsm4F6gJ9CX0IK/s77vc/cp7l7h7hWlpaUZKDnZOneG55+HLVvgzDPhL3+JXZGIFIq0At3MWhDCfIa7PwPg7h+7+3Z33wHcD6iDIEP69IGnnoI//SksDVBdHbsiESkE6YxyMWAasNzd76qzv6zOaecCyzJfXvE66SS491747W9h3DgNZxSRhqUzymUg8H3gLTNbmtp3HfA9M+sLOLAaGJ2VCovY5ZeH1Rn/6Z/C6ozjx8euSETyWTqjXBYAVs+hFzJfjuzqH/8xDGecOBEOPTSs/SIiUh/NFM1zJSXw8MNQUQF/+7ewZEnsikQkXynQC0CbNjBrVljQ66yzYO3a2BWJSD5SoBeIAw8MqzNu3hxC/fPPY1ckIvlGgV5AvvY1ePJJWLYsjIL58MPYFYlIPlGgF5hTTw1j1N9+G449Fl59NXZFIpIvFOgFaNgw+P3voWVLGDwYZsyIXZGI5AMFeoE66qjQOh8wICzmde21sH177KpEJCYFegHr1Almz4YxY8Lko7PP1tovIsVMgV7gWrSAyZPD9tJLocW+YkXsqkQkBgV6QowZE1rrVVVhHfXZsxv+HhFJFgV6ggwZEvrVu3aF006DSZO0qJdIMVGgJ0yPHmEEzDnnhMW8LrsMtm6NXZWI5IICPYH22y+MVb/hhvBYuxNPhI8+il2ViGSbAj2hSkrgppvCzNI33giTkLSwl0iyKdATbvhwWLgwBPygQfDYY7ErEpFsUaAXgb59w83Sigr43vfg+uthx47YVYlIpinQi0TnzjBnTngK0m23wbnnwqZNsasSkUxSoBeRli1hyhS4556wFO8JJ8B778WuSkQyRYFeZMxg7Ngwq3T9+jAJac6c2FWJSCYo0IvU0KHwyivhwRmnngq/+pUmIYkUOgV6EevZExYtgtNPh3HjYPRo+PLL2FWJSFMp0Itcu3bw3HNh5Mv998O3vgUbNsSuSkSaQoEulJTALbfAo4+GyUfHHgtLl8auSkQaS4Eu/+/CC2H+/DBGfeDAsHyAiBQOBbrs5JhjwiSko4+G88+HG2/UJCSRQqFAl6848ECYOxcuuQRuvjksH7B5c+yqRKQhCnSpV6tWMG1aWFN95swwCen992NXJSJ70mCgm1lXM5trZsvN7G0zuzq1v4OZzTazFamvB2S/XMklM7j6anjxRVi7NtwsnTcvdlUisjvptNCrgYnufiQwALjKzPoAPwXmuHtvYE7qvSTQKaeESUilpXDyyXDffbErEpH6NBjo7r7e3V9Lvf4cWA4cDJwDTE+dNh0Ylq0iJb7eveEPfwjhPmYMXHklbNsWuyoRqatRfehm1h3oBywGurj7egihD3TOdHGSX/bfH2bNgmuvhXvvDa31NWtiVyUiNdIOdDPbD3gauMbd01541cxGmVmlmVVWVVU1pUbJI82awe23w7//OyxeHFruP/yhgl0kH6QV6GbWghDmM9z9mdTuj82sLHW8DKh3wri7T3H3CnevKC0tzUTNkgdGjIB334UrrgjPLVWwi8SXzigXA6YBy939rjqHZgEjU69HAjMzX57ks65d4de/hpUrFewi+SCdFvpA4PvAUDNbmtpOB24HTjazFcDJqfdShBTsIvnBPIeLYFdUVHhlZWXOPk/iWLs29LNPnRrWWL/0UrjuOujWLXZlIoXJzJa4e0VD52mmqGRcfS32Xr3CcMcPPohdnUhyKdAla+oG++WXh6UEFOwi2aNAl6zr2hUmTw4PpFawi2SPAl1ypibY1WIXyQ4FuuRct24KdpFsUKBLNHWD/bLLaoP9yivDSBkRaRwFukTXrVtYG6Ym2KdOhZ49FewijaVAl7yhYBfZOwp0yTsKdpGmUaBL3qoJ9hUrwmxTBbvIninQJe8dckh4SlLdYO/VC666SsEuUpcCXQpG3WC/5BK4//7a4Y6vvRbWjREpZgp0KTi7Bvu0aXDMMdCnD9xyC6xaFbtCkTgU6FKwaoL9o4/g3/4NOneGv//70M9+wglhHRk9JEuKiQJdCl6HDjBqFPz3f4f112+/HTZvhrFjoawMzjgDHnkEvvgidqUi2aVAl0Tp1i08xPrNN8P24x/DW2+FR+Z16QIXXQQvvgjV1bErFck8Bbok1te/Hlrrq1eH1vuIEfDCC3D66XDQQTBuHPzhD7qZKsmhQJfEKymBb34z9LOvXw/PPQdDhoThj8cfHx6Xd8MN8M47sSsV2TsKdCkqrVrBOefAE0/Axx+Hpyn16AG33gpHHAEVFXD33SH4RQqNAl2KVrt2cPHFMHs2rFsHd90V9k+YAOXlcPLJ8JvfwKZNMasUSZ8CXYQwGmb8eKishOXL4frrw3j2Sy4JN1O/+12YORO+/DJ2pSK7p0AX2cURR8DNN4fFwRYtCg/hmDcPhg2DAw+E0aPh5Zdhx47YlYrsTIEushtmMGAA3HMP/PnPtSNkZsyAv/mb0Pf+05+GYZEi+cA8h2O2KioqvLKyMmefJ5INX3wRul9mzIDf/ha2bw9DJC+4IAR9RQW0bh27SkkSM1vi7hUNnqdAF2m6qqowYmbGjNA9A9CyZQj1gQNh0KCwDEGnTnHrlMKmQBfJsU8+gd//HhYuhAULwg3WmpuoRxwRwr0m5Hv2DF06IulQoItEtmVLCPUFC0LIL1wIn30WjnXpEsK9JuD79YMWLeLWK/kr3UBvnsYPegA4E9jg7l9L7fsFcAVQs5bdde7+QtPLFUme1q1DWA8aFN7v2BGGRNa04BcuhGeeCcf22QeOO662FX/88bD//vFql8LUYAvdzL4JbAYe2iXQN7v7vzTmw9RCF9nZ+vW1Ab9gASxdGm6ymoUbrTV/EAYODAuPSXHKWAvd3V82s+6ZKEpEdlZWBsOHhw3Csr+LF9eG/EMPweTJ4VjXrrVdNAMHhsBv1ixe7ZJ/Ggz0PRhrZj8AKoGJ7v5ZhmoSKVr77Qff+lbYICzz+9ZbtV00L78Mjz0WjrVtG7pmalrx/fvDvvvGq13iS+umaKqF/nydLpcuwCeAA/8AlLn7pbv53lHAKIBu3bods2bNmowULlKM3MNDPOr2wy9bFvY3awbf+EZtwA8aFJ7iJIUvo6Ncdg30dI/tSn3oIpm3cWMYA1/TD//KK2GEDcBhh8HgwbVbjx4aLlmIMtaHvpsfXubuNQuMngssa8rPEZG91749fPvbYQPYuhWWLAnhPn8+PP10eJA2hD77mnAfNEj98EmTziiXR4EhQCfgY+DG1Pu+hC6X1cDoOgG/W2qhi+Tejh3wxz+GcK/Z1q0Lx9q1q73ROngwHHusli3IR5pYJCK7tWZNCPaaVvwf/xj2t2wZQr2mFX/CCeF/ABKXAl1E0vbJJ7U3WufPD1021dW14+FrumgGD4aDD45dbfFRoItIk33xRbi5WtNFs2hR2AfhxmpNuA8eDIcfrhut2aZAF5GMqa4Os1jrdtNUpRb+6NRp54Dv21fr0mSaAl1EssYd3n23Ntznzw+P7IMwuWnAgLDgWO/eYehk795w0EFqyTeVAl1EcurDD2sDfsGCsBDZ1q21x9u0gV69QrjvunXporDfEwW6iES1fTusXQsrVnx1W7UqdOPUaNs2hH1Na77u1rGjwj6rE4tERBrSrBl07x62k0/e+Vh1dRg6uWvQV1bCk0/u/ADu9u3rb9X37g0HHJDLf1H+UwtdRPLKl1/C++/X37L/4IPQf1+jY8f6W/W9e4dWf1KohS4iBallyzAU8vDDv3psy5bQXVM35N99F+bMCUsN19WlS/1B36tXclelVKCLSMFo3Rr69Anbrv76V1i58qut+hdfhAcf3Pncgw6qP+x79gxPjypUCnQRSYQ2beCoo8K2q88/rz/sZ86sHU8P4eZreXn9YX/oodCqVe7+PU2hQBeRxGvbNoyL79fvq8f+8pf6++uffBI+/bT2vJKS8BjA+sK+R4/8mEylQBeRorb//lBREbZdffpp/WE/Y0b4Q1CjZkTPrn31vXuH/c1zlLQKdBGR3ejQAY47Lmx1uYcFzeqGfE2XzsKFoYunRvPmoQU/ZQoMGZLdehXoIiKNZAalpWE74YSdj7nDhg1fbdV36pT9uhToIiIZZBaGTHbpEhYty6WS3H6ciIhkiwJdRCQhFOgiIgmhQBcRSQgFuohIQijQRUQSQoEuIpIQCnQRkYTI6QMuzKwKWJOzD8yOTsAnsYvII7oetXQtdqbrsbO9uR6HuHtpQyflNNCTwMwq03lySLHQ9aila7EzXY+d5eJ6qMtFRCQhFOgiIgmhQG+8KbELyDO6HrV0LXam67GzrF8P9aGLiCSEWugiIgmhQN8DM3vAzDaY2bI6+zqY2WwzW5H6ekDMGnPFzLqa2VwzW25mb5vZ1an9xXo9WpvZK2b2Rup63JTa38PMFqeux+Nm1jJ2rbliZs3M7HUzez71vpivxWoze8vMlppZZWpf1n9XFOh79hvgtF32/RSY4+69gTmp98WgGpjo7kcCA4CrzKwPxXs9tgJD3f1ooC9wmpkNAO4A7k5dj8+AyyLWmGtXA8vrvC/mawFworv3rTNUMeu/Kwr0PXD3l4FPd9l9DjA99Xo6MCynRUXi7uvd/bXU688Jv7gHU7zXw919c+pti9TmwFDgqdT+orkeZlYOnAFMTb03ivRa7EHWf1cU6I3Xxd3XQwg5oHPkenLOzLoD/YDFFPH1SHUxLAU2ALOB94CN7l6dOmUd4Y9eMZgE/B2wI/W+I8V7LSD8cX/JzJaY2ajUvqz/ruiZotIoZrYf8DRwjbtvCg2x4uTu24G+ZtYeeBY4sr7TcltV7pnZmcAGd19iZkNqdtdzauKvRR0D3f1DM+sMzDazP+XiQ9VCb7yPzawMIPV1Q+R6csbMWhDCfIa7P5PaXbTXo4a7bwTmEe4ttDezmoZSOfBhrLpyaCBwtpmtBh4jdLVMojivBQDu/mHq6wbCH/v+5OB3RYHeeLOAkanXI4GZEWvJmVSf6DRgubvfVedQsV6P0lTLHDPbBziJcF9hLjA8dVpRXA93/5m7l7t7d+BC4HfuPoIivBYAZravmbWteQ2cAiwjB78rmli0B2b2KDCEsErax8CNwHPAE0A34APgfHff9cZp4pjZIGA+8Ba1/aTXEfrRi/F6HEW4sdWM0DB6wt1vNrNDCa3UDsDrwEXuvjVepbmV6nL5sbufWazXIvXvfjb1tjnwiLvfamYdyfLvigJdRCQh1OUiIpIQCnQRkYRQoIuIJIQCXUQkIRToIiIJoUAXEUkIBbqISEIo0EVEEuL/ANqC+73TbeQfAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "#绘制不同K对应的聚类的性能，找出最佳模型/参数（分数最大的）\n",
    "plt.plot(ks, np.array(CH_scores), 'b-', label = 'CH_scores')\n",
    "\n",
    "#最佳超参数\n",
    "index = np.unravel_index(np.argmax(CH_scores, axis = None), len(CH_scores))\n",
    "best_k = ks[index[0]]\n",
    "print(best_k)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "nbTranslate": {
   "displayLangs": [
    "zh-cn"
   ],
   "hotkey": "alt-t",
   "langInMainMenu": true,
   "sourceLang": "en",
   "targetLang": "zh-cn",
   "useGoogleTranslate": true
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
